{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:51:39.255565Z",
     "start_time": "2020-08-22T13:51:34.696433Z"
    }
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "from datetime import datetime, timedelta\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from scipy import stats\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "\n",
    "from tqdm import tqdm\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "import gc\n",
    "import os\n",
    "\n",
    "import lightgbm as lgb\n",
    "from sklearn.model_selection import StratifiedKFold, KFold\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from gensim.models import Word2Vec\n",
    "from sklearn.feature_selection import SelectPercentile, f_classif, chi2\n",
    "\n",
    "warnings.simplefilter('ignore')\n",
    "tqdm.pandas()\n",
    "%matplotlib inline\n",
    "\n",
    "pd.set_option('max_columns', None)\n",
    "pd.set_option('max_rows', None)\n",
    "pd.set_option('max_colwidth', 200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:51:39.872512Z",
     "start_time": "2020-08-22T13:51:39.866832Z"
    }
   },
   "outputs": [],
   "source": [
    "seed = 2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:51:49.252695Z",
     "start_time": "2020-08-22T13:51:39.874572Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 读取数据\n",
    "df_train_label = pd.read_csv('raw_data/train/train_label.csv')\n",
    "df_train_base = pd.read_csv('raw_data/train/train_base.csv')\n",
    "df_train_trans = pd.read_csv('raw_data/train/train_trans.csv')\n",
    "df_train_op = pd.read_csv('raw_data/train/train_op.csv')\n",
    "\n",
    "df_test_base = pd.read_csv('raw_data/test_b/testb_base.csv')\n",
    "df_test_trans = pd.read_csv('raw_data/test_b/testb_trans.csv')\n",
    "df_test_op = pd.read_csv('raw_data/test_b/testb_op.csv')\n",
    "\n",
    "df_trans = df_train_trans.append(df_test_trans)\n",
    "df_trans = df_trans.reset_index(drop=True)\n",
    "\n",
    "df_op = df_train_op.append(df_test_op)\n",
    "df_op = df_op.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:52:35.471623Z",
     "start_time": "2020-08-22T13:51:49.253987Z"
    }
   },
   "outputs": [],
   "source": [
    "def parse_time(tm):\n",
    "    days, _, time = tm.split(' ')\n",
    "    time = time.split('.')[0]\n",
    "\n",
    "    time = '2020-1-1 ' + time\n",
    "    time = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')\n",
    "    time = (time + timedelta(days=int(days)))\n",
    "\n",
    "    return time\n",
    "\n",
    "\n",
    "df_trans['date'] = df_trans['tm_diff'].apply(parse_time)\n",
    "df_trans['day'] = df_trans['date'].dt.day\n",
    "df_trans['hour'] = df_trans['date'].dt.hour\n",
    "\n",
    "df_op['date'] = df_op['tm_diff'].apply(parse_time)\n",
    "df_op['day'] = df_op['date'].dt.day\n",
    "df_op['hour'] = df_op['date'].dt.hour"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:52:38.151656Z",
     "start_time": "2020-08-22T13:52:35.472712Z"
    }
   },
   "outputs": [],
   "source": [
    "df_trans.sort_values(['user', 'date'], inplace=True)\n",
    "df_trans = df_trans.reset_index(drop=True)\n",
    "\n",
    "df_op.sort_values(['user', 'date'], inplace=True)\n",
    "df_op = df_op.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:52:38.218564Z",
     "start_time": "2020-08-22T13:52:38.153272Z"
    }
   },
   "outputs": [],
   "source": [
    "df_train_base.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:52:38.412613Z",
     "start_time": "2020-08-22T13:52:38.219765Z"
    }
   },
   "outputs": [],
   "source": [
    "df_train = df_train_base.merge(df_train_label, how='left')\n",
    "df_test = df_test_base\n",
    "\n",
    "df_feature = df_train.append(df_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:52:38.418115Z",
     "start_time": "2020-08-22T13:52:38.413659Z"
    }
   },
   "outputs": [],
   "source": [
    "def select_feature(df, select_feature, ycol, p):\n",
    "    X = df[select_feature]\n",
    "    X.fillna(0, inplace=True)\n",
    "    Y = df[ycol]\n",
    "\n",
    "    selectChi2 = SelectPercentile(chi2, percentile=p).fit(X, Y)\n",
    "    selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, Y)\n",
    "\n",
    "    chi2_selected = selectChi2.get_support()\n",
    "    print('Chi2 selected {} features.'.format(chi2_selected.sum()))\n",
    "    f_classif_selected = selectF_classif.get_support()\n",
    "    print('F_classif selected {} features.'.format(f_classif_selected.sum()))\n",
    "    selected = chi2_selected & f_classif_selected\n",
    "    print('Chi2 & F_classif selected {} features'.format(selected.sum()))\n",
    "    selected_features = [f for f, s in zip(select_feature, selected) if s]\n",
    "\n",
    "    del_features = list(set(select_feature) - set(selected_features))\n",
    "    del_features.sort()\n",
    "    return del_features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-20T02:42:35.266931Z",
     "start_time": "2020-07-20T02:42:35.263531Z"
    }
   },
   "source": [
    "# 特征工程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:52:38.443323Z",
     "start_time": "2020-08-22T13:52:38.419060Z"
    },
    "code_folding": [
     4
    ]
   },
   "outputs": [],
   "source": [
    "os.makedirs('model', exist_ok=True)\n",
    "os.makedirs('embedding', exist_ok=True)\n",
    "\n",
    "\n",
    "def w2v_emb(df, f1, f2):\n",
    "    emb_size = 16\n",
    "\n",
    "    model_path = 'model/w2v_{}_{}_{}.m'.format(f1, f2, emb_size)\n",
    "    embedding_path = 'embedding/{}_{}_{}.pkl'.format(f1, f2, emb_size)\n",
    "\n",
    "    if os.path.exists(embedding_path):\n",
    "        embedding = pd.read_pickle(embedding_path)\n",
    "        return embedding\n",
    "\n",
    "    tmp = df.groupby(f1, as_index=False)[f2].agg(\n",
    "        {'{}_{}_list'.format(f1, f2): list})\n",
    "\n",
    "    sentences = tmp['{}_{}_list'.format(f1, f2)].values.tolist()\n",
    "    del tmp['{}_{}_list'.format(f1, f2)]\n",
    "    for i in range(len(sentences)):\n",
    "        sentences[i] = [str(x) for x in sentences[i]]\n",
    "\n",
    "    if os.path.exists(model_path):\n",
    "        model = Word2Vec.load(model_path)\n",
    "    else:\n",
    "        model = Word2Vec(sentences,\n",
    "                         size=emb_size,\n",
    "                         window=5,\n",
    "                         min_count=5,\n",
    "                         sg=0,\n",
    "                         hs=1,\n",
    "                         seed=seed)\n",
    "        model.save(model_path)\n",
    "\n",
    "    emb_matrix = []\n",
    "    for seq in tqdm(sentences):\n",
    "        vec = []\n",
    "        for w in seq:\n",
    "            if w in model:\n",
    "                vec.append(model[w])\n",
    "        if len(vec) > 0:\n",
    "            emb_matrix.append(np.mean(vec, axis=0))\n",
    "        else:\n",
    "            emb_matrix.append([0] * emb_size)\n",
    "\n",
    "    df_emb = pd.DataFrame(emb_matrix)\n",
    "    df_emb.columns = [\n",
    "        '{}_{}_emb_{}'.format(f1, f2, i) for i in range(emb_size)\n",
    "    ]\n",
    "\n",
    "    embedding = pd.concat([tmp, df_emb], axis=1)\n",
    "    embedding.to_pickle(embedding_path)\n",
    "\n",
    "    return embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:52:38.503777Z",
     "start_time": "2020-08-22T13:52:38.445509Z"
    },
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "def tfidf_emb(df, f1, f2):\n",
    "    emb_size = 10\n",
    "\n",
    "    df[f2] = df[f2].astype(str)\n",
    "    df[f2].fillna('-1', inplace=True)\n",
    "    group_df = df.groupby([f1]).apply(\n",
    "        lambda x: x[f2].tolist()).reset_index()\n",
    "    group_df.columns = [f1, 'list']\n",
    "    group_df['list'] = group_df['list'].apply(lambda x: ','.join(x))\n",
    "    enc_vec = TfidfVectorizer()\n",
    "    tfidf_vec = enc_vec.fit_transform(group_df['list'])\n",
    "    svd_enc = TruncatedSVD(n_components=emb_size, n_iter=20, random_state=seed)\n",
    "    vec_svd = svd_enc.fit_transform(tfidf_vec)\n",
    "    vec_svd = pd.DataFrame(vec_svd)\n",
    "    vec_svd.columns = ['svd_tfidf_{}_{}'.format(\n",
    "        f2, i) for i in range(emb_size)]\n",
    "    group_df = pd.concat([group_df, vec_svd], axis=1)\n",
    "    del group_df['list']\n",
    "    return group_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:52:38.592403Z",
     "start_time": "2020-08-22T13:52:38.505932Z"
    }
   },
   "outputs": [],
   "source": [
    "def countvec_emb(df, f1, f2):\n",
    "    emb_size = 10\n",
    "\n",
    "    df[f2] = df[f2].astype(str)\n",
    "    df[f2].fillna('-1', inplace=True)\n",
    "    group_df = df.groupby([f1]).apply(\n",
    "        lambda x: x[f2].tolist()).reset_index()\n",
    "    group_df.columns = [f1, 'list']\n",
    "    group_df['list'] = group_df['list'].apply(lambda x: ','.join(x))\n",
    "    enc_vec = CountVectorizer()\n",
    "    tfidf_vec = enc_vec.fit_transform(group_df['list'])\n",
    "    svd_enc = TruncatedSVD(n_components=emb_size, n_iter=20, random_state=seed)\n",
    "    vec_svd = svd_enc.fit_transform(tfidf_vec)\n",
    "    vec_svd = pd.DataFrame(vec_svd)\n",
    "    vec_svd.columns = ['svd_countvec_{}_{}'.format(\n",
    "        f2, i) for i in range(emb_size)]\n",
    "    group_df = pd.concat([group_df, vec_svd], axis=1)\n",
    "    del group_df['list']\n",
    "    return group_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 操作信息特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:52:38.632229Z",
     "start_time": "2020-08-22T13:52:38.594067Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_op.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:52:52.349871Z",
     "start_time": "2020-08-22T13:52:38.633245Z"
    }
   },
   "outputs": [],
   "source": [
    "for f1, f2 in [['user', 'op_mode'], ['user', 'op_type']]:\n",
    "    df_feature = df_feature.merge(tfidf_emb(df_op, f1, f2), on=f1, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:05.836909Z",
     "start_time": "2020-08-22T13:52:52.351029Z"
    }
   },
   "outputs": [],
   "source": [
    "for f1, f2 in [['user', 'op_mode'], ['user', 'op_type']]:\n",
    "    df_feature = df_feature.merge(\n",
    "        countvec_emb(df_op, f1, f2), on=f1, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:07.675468Z",
     "start_time": "2020-08-22T13:53:05.838018Z"
    }
   },
   "outputs": [],
   "source": [
    "for col in ['channel', 'op_mode']:\n",
    "    df_temp = df_op[['user', col]].copy()\n",
    "    df_temp['tmp'] = 1\n",
    "    df_temp = df_temp.pivot_table(index='user', columns=col,\n",
    "                                  values='tmp', aggfunc=np.sum).reset_index().fillna(0)\n",
    "    df_temp.columns = [c if c == 'user' else 'op_{}_{}_count'.format(\n",
    "        col, c) for c in df_temp.columns]\n",
    "    df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:08.799574Z",
     "start_time": "2020-08-22T13:53:07.676590Z"
    }
   },
   "outputs": [],
   "source": [
    "for col in ['op_type']:\n",
    "    df_temp = df_op[['user', 'hour', col]].copy()\n",
    "    df_temp = df_temp.pivot_table(index='user', columns=col,\n",
    "                                  values='hour', aggfunc=np.mean).reset_index().fillna(0)\n",
    "    df_temp.columns = [c if c == 'user' else 'op_{}_{}_hour_mean'.format(\n",
    "        col, c) for c in df_temp.columns]\n",
    "    df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:09.449627Z",
     "start_time": "2020-08-22T13:53:08.800985Z"
    }
   },
   "outputs": [],
   "source": [
    "df_temp = df_op.groupby(['user', 'op_device']).size().reset_index()\n",
    "df_temp.drop([0], axis=1, inplace=True)\n",
    "df_temp = df_temp.sort_values(\n",
    "    by=['user', 'op_device'], ascending=['asc', 'asc'])\n",
    "df_temp.drop_duplicates('user', keep='last', inplace=True)\n",
    "df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:10.175788Z",
     "start_time": "2020-08-22T13:53:09.451277Z"
    }
   },
   "outputs": [],
   "source": [
    "df_temp = df_op.groupby('user')['hour'].agg(\n",
    "    op_hour_std='std').reset_index()\n",
    "df_feature = df_feature.merge(df_temp, how='left')\n",
    "\n",
    "df_temp = df_op.groupby('user')['day'].agg(\n",
    "    op_day_std='std', op_day_max='max').reset_index()\n",
    "df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:26.216684Z",
     "start_time": "2020-08-22T13:53:10.176866Z"
    }
   },
   "outputs": [],
   "source": [
    "df_op['date_diff'] = df_op.groupby('user')['date'].diff()\n",
    "df_op['op_second_diff'] = df_op['date_diff'].dt.seconds\n",
    "df_op['op_hour_diff'] = df_op['op_second_diff'] / 3600\n",
    "df_op['op_day_diff'] = df_op['op_hour_diff'] / 24\n",
    "\n",
    "df_temp = df_op.groupby('user')['op_second_diff'].agg(\n",
    "    op_second_diff_mean='mean').reset_index()\n",
    "df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:26.460271Z",
     "start_time": "2020-08-22T13:53:26.218319Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 交易信息特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:26.473822Z",
     "start_time": "2020-08-22T13:53:26.462478Z"
    }
   },
   "outputs": [],
   "source": [
    "df_trans.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:26.933874Z",
     "start_time": "2020-08-22T13:53:26.474957Z"
    }
   },
   "outputs": [],
   "source": [
    "for f1, f2 in [['user', 'amount']]:\n",
    "    df_feature = df_feature.merge(w2v_emb(df_trans, f1, f2), on=f1, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:29.619235Z",
     "start_time": "2020-08-22T13:53:26.935319Z"
    },
    "code_folding": [],
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for col in ['platform', 'type1', 'type2', 'hour']:\n",
    "    df_temp = df_trans.pivot_table(\n",
    "        index='user', columns=col, values='amount', aggfunc='sum').reset_index()\n",
    "    df_temp.columns = [c if c == 'user' else 'trans_{}_{}_amount_sum'.format(\n",
    "        col, c) for c in df_temp.columns]\n",
    "    df_feature = df_feature.merge(df_temp, how='left')\n",
    "\n",
    "for col in ['type1', 'hour']:\n",
    "    df_temp = df_trans.pivot_table(\n",
    "        index='user', columns=col, values='amount', aggfunc='mean').reset_index()\n",
    "    df_temp.columns = [c if c == 'user' else 'trans_{}_{}_amount_mean'.format(\n",
    "        col, c) for c in df_temp.columns]\n",
    "    df_feature = df_feature.merge(df_temp, how='left')\n",
    "\n",
    "for col in ['type1']:\n",
    "    df_temp = df_trans.pivot_table(\n",
    "        index='user', columns=col, values='amount', aggfunc='max').reset_index()\n",
    "    df_temp.columns = [c if c == 'user' else 'trans_{}_{}_amount_max'.format(\n",
    "        col, c) for c in df_temp.columns]\n",
    "    df_feature = df_feature.merge(df_temp, how='left')\n",
    "\n",
    "for col in ['type1']:\n",
    "    df_temp = df_trans.pivot_table(\n",
    "        index='user', columns=col, values='amount', aggfunc='min').reset_index()\n",
    "    df_temp.columns = [c if c == 'user' else 'trans_{}_{}_amount_min'.format(\n",
    "        col, c) for c in df_temp.columns]\n",
    "    df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:30.930160Z",
     "start_time": "2020-08-22T13:53:29.620494Z"
    }
   },
   "outputs": [],
   "source": [
    "for window in [31, 3, 5, 10, 15]:\n",
    "    df_temp = df_trans[df_trans['day'] > 31-window].groupby('user')['amount'].agg({\n",
    "        'trans_amount_mean_{}d'.format(window): 'mean',\n",
    "        'trans_amount_std_{}d'.format(window): 'std',\n",
    "        'trans_amount_max_{}d'.format(window): 'max',\n",
    "        'trans_amount_min_{}d'.format(window): 'min',\n",
    "        'trans_amount_sum_{}d'.format(window): 'sum',\n",
    "    }).reset_index()\n",
    "    df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:31.732633Z",
     "start_time": "2020-08-22T13:53:30.931317Z"
    }
   },
   "outputs": [],
   "source": [
    "for window in [3, 5, 10]:\n",
    "    for col in ['type1']:\n",
    "        df_temp = df_trans[df_trans['day'] > 31-window].pivot_table(\n",
    "            index='user', columns=col, values='amount', aggfunc='sum').reset_index()\n",
    "        df_temp.columns = [c if c == 'user' else 'trans_{}_{}_amount_sum_{}d'.format(\n",
    "            col, c, window) for c in df_temp.columns]\n",
    "        df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:32.218962Z",
     "start_time": "2020-08-22T13:53:31.733722Z"
    }
   },
   "outputs": [],
   "source": [
    "df_temp = df_trans.groupby(['user'])['ip'].agg(\n",
    "    trans_ip_count='count').reset_index()\n",
    "df_feature = df_feature.merge(df_temp, how='left')\n",
    "\n",
    "df_temp = df_trans.groupby(['user'])['ip_3'].agg(\n",
    "    trans_ip_3_count='count').reset_index()\n",
    "df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:32.445012Z",
     "start_time": "2020-08-22T13:53:32.220036Z"
    }
   },
   "outputs": [],
   "source": [
    "df_temp = df_trans.groupby('user')['hour'].agg(\n",
    "    trans_hour_std='std').reset_index()\n",
    "df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:47.231839Z",
     "start_time": "2020-08-22T13:53:32.446278Z"
    }
   },
   "outputs": [],
   "source": [
    "df_trans['date_diff'] = df_trans.groupby('user')['date'].diff()\n",
    "df_trans['trans_second_diff'] = df_trans['date_diff'].dt.seconds\n",
    "df_trans['trans_hour_diff'] = df_trans['trans_second_diff'] / 3600\n",
    "df_trans['trans_day_diff'] = df_trans['trans_hour_diff'] / 24\n",
    "\n",
    "df_temp = df_trans.groupby('user')['trans_day_diff'].agg(\n",
    "    trans_day_diff_mean='mean', trans_day_diff_std='std').reset_index()\n",
    "df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:47.597394Z",
     "start_time": "2020-08-22T13:53:47.233273Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-20T08:57:03.376937Z",
     "start_time": "2020-07-20T08:57:03.373425Z"
    }
   },
   "source": [
    "## 基本信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:48.103245Z",
     "start_time": "2020-08-22T13:53:47.598684Z"
    }
   },
   "outputs": [],
   "source": [
    "for f in [\n",
    "        'balance', 'balance_avg', 'balance1', 'balance1_avg', 'balance2',\n",
    "        'balance2_avg', 'product1_amount', 'product2_amount',\n",
    "        'product3_amount', 'product4_amount', 'product5_amount', 'product6_amount'\n",
    "]:\n",
    "    df_feature[f] = df_feature[f].apply(lambda x: int(\n",
    "        x.split(' ')[1]) if type(x) != float else np.NaN)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:48.115821Z",
     "start_time": "2020-08-22T13:53:48.104506Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature['product7_fail_ratio'] = df_feature[\n",
    "    'product7_fail_cnt'] / df_feature['product7_cnt']\n",
    "df_feature['card_cnt'] = df_feature['card_a_cnt'] + df_feature[\n",
    "    'card_b_cnt'] + df_feature['card_c_cnt'] + df_feature['card_d_cnt']\n",
    "\n",
    "df_feature['acc_card_ratio'] = df_feature['acc_count'] / df_feature['card_cnt']\n",
    "df_feature['login_cnt'] = df_feature['login_cnt_period1'] + \\\n",
    "    df_feature['login_cnt_period2']\n",
    "\n",
    "df_feature['login_cnt_period2_login_cnt_ratio'] = df_feature['login_cnt_period2'] / \\\n",
    "    df_feature['login_cnt']\n",
    "df_feature['login_cnt_period1_login_cnt_ratio'] = df_feature['login_cnt_period1'] / \\\n",
    "    df_feature['login_cnt']\n",
    "\n",
    "df_feature['using_time_op2_cnt_ratio'] = df_feature['using_time'] / \\\n",
    "    df_feature['op2_cnt']\n",
    "df_feature['using_time_op1_cnt_ratio'] = df_feature['using_time'] / \\\n",
    "    df_feature['op1_cnt']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:53:56.141729Z",
     "start_time": "2020-08-22T13:53:48.117158Z"
    },
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "# 欺诈率\n",
    "def stat(df, df_merge, group_by, agg):\n",
    "    group = df.groupby(group_by).agg(agg)\n",
    "\n",
    "    columns = []\n",
    "    for on, methods in agg.items():\n",
    "        for method in methods:\n",
    "            columns.append('{}_{}_{}'.format('_'.join(group_by), on, method))\n",
    "    group.columns = columns\n",
    "    group.reset_index(inplace=True)\n",
    "    df_merge = df_merge.merge(group, on=group_by, how='left')\n",
    "\n",
    "    del (group)\n",
    "    gc.collect()\n",
    "    return df_merge\n",
    "\n",
    "\n",
    "def statis_feat(df_know, df_unknow):\n",
    "    df_unknow = stat(df_know, df_unknow, ['province'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, ['city'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, [\n",
    "                     'city', 'level'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, ['op_device'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, [\n",
    "                     'age', 'op_device'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, ['using_time'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, [\n",
    "                     'city', 'op_device'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, ['age', 'city'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, [\n",
    "                     'op_device', 'level'], {'label': ['mean']})\n",
    "\n",
    "    return df_unknow\n",
    "\n",
    "\n",
    "df_train = df_feature[~df_feature['label'].isnull()]\n",
    "df_train = df_train.reset_index(drop=True)\n",
    "df_test = df_feature[df_feature['label'].isnull()]\n",
    "\n",
    "df_stas_feat = None\n",
    "kf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)\n",
    "for train_index, val_index in kf.split(df_train, df_train['label']):\n",
    "    df_fold_train = df_train.iloc[train_index]\n",
    "    df_fold_val = df_train.iloc[val_index]\n",
    "\n",
    "    df_fold_val = statis_feat(df_fold_train, df_fold_val)\n",
    "    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)\n",
    "\n",
    "    del (df_fold_train)\n",
    "    del (df_fold_val)\n",
    "    gc.collect()\n",
    "\n",
    "df_test = statis_feat(df_train, df_test)\n",
    "df_feature = pd.concat([df_stas_feat, df_test], axis=0)\n",
    "df_feature = df_feature.reset_index(drop=True)\n",
    "\n",
    "del (df_stas_feat)\n",
    "del (df_train)\n",
    "del (df_test)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-22T13:54:00.869017Z",
     "start_time": "2020-08-22T13:53:56.857131Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature.to_pickle('data/feature2.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
